# Reference: https://www.tidytextmining.com/
library(rtweet)
library(tidyverse)
library(tidytext)
library(wordcloud2)
# search_tweets  # (bare name would just print the function source; removed)

# Collect up to 10,000 English-language tweets matching "lego star wars".
# NOTE: requires Twitter API credentials to be configured for rtweet.
tweet_collection <- search_tweets("lego star wars", n = 10000, lang = "en")
# (console progress output from search_tweets download omitted)
# Keep a pristine copy before filtering.
tweet_collection.orig <- tweet_collection

# Drop retweets.  `is_retweet` is a logical column, so test it directly
# rather than comparing against the string "FALSE" (which only worked by
# accidental coercion).
tweet_collection <- tweet_collection %>%
  filter(!is_retweet)
tweet_collection

# Number each user's tweets (1, 2, ...) so later joins/plots can refer to
# a tweet's position within its author's timeline.
tweets_by_tweeter <- tweet_collection %>%
  group_by(screen_name) %>%
  mutate(line = row_number()) %>%
  ungroup()

# Who tweeted the most?
tweets_by_tweeter %>%
  count(screen_name, sort = TRUE)
# glimpse(tweets_by_tweeter)
# (stray R Markdown chunk fence `{r}` removed — this file is a plain script)
# Collect the status_ids of tweets whose hashtags match unwanted topics.
# NOTE(review): every topic filter below is commented out, so this currently
# gathers the id of EVERY tweet that has at least one hashtag, and the
# anti_join below removes all of them — confirm that is intended.
bad_hashtags <- tweets_by_tweeter %>%
  select(status_id, hashtags) %>%
  # `hashtags` is a list-column; name it explicitly (bare unnest() is deprecated)
  unnest(hashtags) %>%
  # filter(str_detect(hashtags, regex("electionnight", ignore_case = TRUE)) |
  # #	str_detect(hashtags, regex("election20", ignore_case = TRUE)) |
  # str_detect(hashtags, "2020")) %>%
  distinct(status_id)
bad_hashtags
# group_by(hashtags) %>%
# summarise(tot_status_id = n()) %>%
# arrange(-tot_status_id)  # ElectionNight / Election2020
# count(hashtags, sort = TRUE)

# Remove the flagged tweets; an explicit key avoids the "Joining, by" message.
tweets_by_tweeter <- tweets_by_tweeter %>%
  anti_join(bad_hashtags, by = "status_id")
tweets_by_tweeter
# "Because we have kept text such as hashtags and usernames in the dataset,
# we can't use a simple anti_join() to remove stop words. Instead, we can
# take the approach shown in the filter() line that uses str_detect() from
# the stringr package." -- https://www.tidytextmining.com/twitter.html
# Tokenize tweet text with the Twitter-aware tokenizer (preserves @mentions
# and #hashtags), then drop stop words — including apostrophe-stripped
# variants (e.g. "dont") — and keep only tokens containing a letter.
tweets_tokenized <- tweets_by_tweeter %>%
  select(text, screen_name, line) %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"),
         str_detect(word, "[a-z]"))
# Console note emitted by unnest_tokens (pasted output, kept as a comment):
# Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
tweets_tokenized
# Peek at rtweet's multi-language stop-word table (columns include `word`).
head(stopwordslangs)

# Word frequencies excluding @mentions and the extra stop-word lexicon.
# An explicit join key avoids the 'Joining, by = "word"' message.
tweets_tokenized %>%
  count(word, sort = TRUE, name = "freq") %>%
  filter(!str_detect(word, "^\\@")) %>%
  anti_join(stopwordslangs, by = "word")  # anti_join(tidytext::get_stopwords())
# Per-user relative word frequencies: n = times a user used a word,
# total = that user's token count, freq = n / total.
# Explicit `by =` and `.groups = "drop"` silence the console messages the
# original run produced ("Joining, by = ..." / "`summarise()` ungrouping ...").
frequency <- tweets_tokenized %>%
  group_by(screen_name) %>%
  count(word, sort = TRUE) %>%
  left_join(tweets_tokenized %>%
              group_by(screen_name) %>%
              summarise(total = n(), .groups = "drop"),
            by = "screen_name") %>%
  mutate(freq = n / total)
frequency
# "This is a nice and tidy data frame but we would actually like to plot
# those frequencies on the x- and y-axes of a plot, so we will need to use
# spread() from tidyr [to] make a differently shaped data frame."
# -- https://www.tidytextmining.com/twitter.html
# (spread() is superseded; tidyr::pivot_wider() is used below instead.)
# Reshape to wide form: one column per screen_name holding that user's word
# frequencies.  Missing word/user combinations stay NA; passing
# `values_fill = 0` would zero-fill them instead.
frequency <- pivot_wider(
  select(frequency, screen_name, word, freq),
  names_from = screen_name,
  values_from = freq
)  # , values_fill = 0
frequency